• PROJECT OBJECTIVE: To Demonstrate the ability to fetch, process and leverage data to generate useful predictions by training Supervised Learning algorithms.
# import packages
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')
a. Read all the 3 CSV files as DataFrame and store them into 3 separate variables.
# read all files -- one CSV per patient class
Norm=pd.read_csv("Normal.csv")   # records labelled Normal
TyS=pd.read_csv("Type_S.csv")    # records labelled Type_S
TyH=pd.read_csv("Type_H.csv")    # records labelled Type_H
# display the fetched data
display("Data: Normal.csv",Norm)
display("Data: Type_S.csv",TyS)
display("Data: Type_H.csv",TyH)
'Data: Normal.csv'
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 | Normal |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 | Normal |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 | Normal |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 | Normal |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 47.903565 | 13.616688 | 36.000000 | 34.286877 | 117.449062 | -4.245395 | Normal |
| 96 | 53.936748 | 20.721496 | 29.220534 | 33.215251 | 114.365845 | -0.421010 | Normal |
| 97 | 61.446597 | 22.694968 | 46.170347 | 38.751628 | 125.670725 | -2.707880 | Normal |
| 98 | 45.252792 | 8.693157 | 41.583126 | 36.559635 | 118.545842 | 0.214750 | Normal |
| 99 | 33.841641 | 5.073991 | 36.641233 | 28.767649 | 123.945244 | -0.199249 | Normal |
100 rows × 7 columns
'Data: Type_S.csv'
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 74.377678 | 32.053104 | 78.772013 | 42.324573 | 143.560690 | 56.125906 | Type_S |
| 1 | 89.680567 | 32.704435 | 83.130732 | 56.976132 | 129.955476 | 92.027277 | Type_S |
| 2 | 44.529051 | 9.433234 | 52.000000 | 35.095817 | 134.711772 | 29.106575 | Type_S |
| 3 | 77.690577 | 21.380645 | 64.429442 | 56.309932 | 114.818751 | 26.931841 | Type_S |
| 4 | 76.147212 | 21.936186 | 82.961502 | 54.211027 | 123.932010 | 10.431972 | Type_S |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 145 | 80.111572 | 33.942432 | 85.101608 | 46.169139 | 125.593624 | 100.292107 | Type_S |
| 146 | 95.480229 | 46.550053 | 59.000000 | 48.930176 | 96.683903 | 77.283072 | Type_S |
| 147 | 74.094731 | 18.823727 | 76.032156 | 55.271004 | 128.405731 | 73.388216 | Type_S |
| 148 | 87.679087 | 20.365613 | 93.822416 | 67.313473 | 120.944829 | 76.730629 | Type_S |
| 149 | 48.259920 | 16.417462 | 36.329137 | 31.842457 | 94.882336 | 28.343799 | Type_S |
150 rows × 7 columns
'Data: Type_H.csv'
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 63.027817 | 22.552586 | 39.609117 | 40.475232 | 98.672917 | -0.254400 | Type_H |
| 1 | 39.056951 | 10.060991 | 25.015378 | 28.995960 | 114.405425 | 4.564259 | Type_H |
| 2 | 68.832021 | 22.218482 | 50.092194 | 46.613539 | 105.985135 | -3.530317 | Type_H |
| 3 | 69.297008 | 24.652878 | 44.311238 | 44.644130 | 101.868495 | 11.211523 | Type_H |
| 4 | 49.712859 | 9.652075 | 28.317406 | 40.060784 | 108.168725 | 7.918501 | Type_H |
| 5 | 40.250200 | 13.921907 | 25.124950 | 26.328293 | 130.327871 | 2.230652 | Type_H |
| 6 | 53.432928 | 15.864336 | 37.165934 | 37.568592 | 120.567523 | 5.988551 | Type_H |
| 7 | 45.366754 | 10.755611 | 29.038349 | 34.611142 | 117.270067 | -10.675871 | Type_H |
| 8 | 43.790190 | 13.533753 | 42.690814 | 30.256437 | 125.002893 | 13.289018 | Type_H |
| 9 | 36.686353 | 5.010884 | 41.948751 | 31.675469 | 84.241415 | 0.664437 | Type_H |
| 10 | 49.706610 | 13.040974 | 31.334500 | 36.665635 | 108.648265 | -7.825986 | Type_H |
| 11 | 31.232387 | 17.715819 | 15.500000 | 13.516568 | 120.055399 | 0.499751 | Type_H |
| 12 | 48.915551 | 19.964556 | 40.263794 | 28.950995 | 119.321358 | 8.028895 | Type_H |
| 13 | 53.572170 | 20.460828 | 33.100000 | 33.111342 | 110.966698 | 7.044803 | Type_H |
| 14 | 57.300227 | 24.188885 | 47.000000 | 33.111342 | 116.806587 | 5.766947 | Type_H |
| 15 | 44.318907 | 12.537992 | 36.098763 | 31.780915 | 124.115836 | 5.415825 | Type_H |
| 16 | 63.834982 | 20.362507 | 54.552434 | 43.472475 | 112.309491 | -0.622527 | Type_H |
| 17 | 31.276012 | 3.144669 | 32.562996 | 28.131342 | 129.011418 | 3.623020 | Type_H |
| 18 | 38.697912 | 13.444749 | 31.000000 | 25.253163 | 123.159251 | 1.429186 | Type_H |
| 19 | 41.729963 | 12.254074 | 30.122586 | 29.475889 | 116.585706 | -1.244402 | Type_H |
| 20 | 43.922840 | 14.177959 | 37.832547 | 29.744881 | 134.461016 | 6.451648 | Type_H |
| 21 | 54.919443 | 21.062332 | 42.200000 | 33.857110 | 125.212716 | 2.432561 | Type_H |
| 22 | 63.073611 | 24.413803 | 54.000000 | 38.659808 | 106.424329 | 15.779697 | Type_H |
| 23 | 45.540790 | 13.069598 | 30.298321 | 32.471192 | 117.980830 | -4.987130 | Type_H |
| 24 | 36.125683 | 22.758753 | 29.000000 | 13.366931 | 115.577116 | -3.237562 | Type_H |
| 25 | 54.124920 | 26.650489 | 35.329747 | 27.474432 | 121.447011 | 1.571205 | Type_H |
| 26 | 26.147921 | 10.759454 | 14.000000 | 15.388468 | 125.203296 | -10.093108 | Type_H |
| 27 | 43.580964 | 16.508884 | 47.000000 | 27.072080 | 109.271634 | 8.992816 | Type_H |
| 28 | 44.551012 | 21.931147 | 26.785916 | 22.619865 | 111.072920 | 2.652321 | Type_H |
| 29 | 66.879211 | 24.891999 | 49.278597 | 41.987212 | 113.477018 | -2.005892 | Type_H |
| 30 | 50.819268 | 15.402213 | 42.528939 | 35.417055 | 112.192804 | 10.869566 | Type_H |
| 31 | 46.390260 | 11.079047 | 32.136553 | 35.311213 | 98.774546 | 6.386832 | type_h |
| 32 | 44.936675 | 17.443838 | 27.780576 | 27.492837 | 117.980324 | 5.569620 | type_h |
| 33 | 38.663257 | 12.986441 | 40.000000 | 25.676816 | 124.914118 | 2.703008 | type_h |
| 34 | 59.595540 | 31.998244 | 46.560252 | 27.597296 | 119.330354 | 1.474286 | type_h |
| 35 | 31.484218 | 7.826221 | 24.284818 | 23.657997 | 113.833145 | 4.393080 | type_h |
| 36 | 32.090987 | 6.989378 | 35.998198 | 25.101609 | 132.264735 | 6.413428 | type_h |
| 37 | 35.703458 | 19.443253 | 20.700000 | 16.260205 | 137.540613 | -0.263490 | type_h |
| 38 | 55.843286 | 28.847448 | 47.690543 | 26.995838 | 123.311845 | 2.812427 | type_h |
| 39 | 52.419385 | 19.011561 | 35.872660 | 33.407825 | 116.559771 | 1.694705 | type_h |
| 40 | 35.492446 | 11.701672 | 15.590363 | 23.790774 | 106.938852 | -3.460358 | type_h |
| 41 | 46.442078 | 8.395036 | 29.037230 | 38.047043 | 115.481405 | 2.045476 | type_h |
| 42 | 53.854798 | 19.230643 | 32.779060 | 34.624155 | 121.670915 | 5.329843 | type_h |
| 43 | 66.285394 | 26.327845 | 47.500000 | 39.957549 | 121.219684 | -0.799624 | type_h |
| 44 | 56.030218 | 16.297915 | 62.275275 | 39.732303 | 114.023117 | -2.325684 | type_h |
| 45 | 50.912440 | 23.015169 | 47.000000 | 27.897271 | 117.422259 | -2.526702 | type_h |
| 46 | 48.332638 | 22.227784 | 36.181993 | 26.104854 | 117.384625 | 6.481709 | type_h |
| 47 | 41.352504 | 16.577364 | 30.706191 | 24.775141 | 113.266675 | -4.497958 | type_h |
| 48 | 40.557357 | 17.977784 | 34.000000 | 22.579573 | 121.046246 | -1.537383 | type_h |
| 49 | 41.767732 | 17.899402 | 20.030886 | 23.868330 | 118.363389 | 2.062963 | type_h |
| 50 | 55.285852 | 20.440118 | 34.000000 | 34.845733 | 115.877017 | 3.558372 | type_h |
| 51 | 74.433593 | 41.557331 | 27.700000 | 32.876262 | 107.949304 | 5.000089 | type_h |
| 52 | 50.209670 | 29.760122 | 36.104007 | 20.449548 | 128.292515 | 5.740614 | type_h |
| 53 | 30.149936 | 11.917445 | 34.000000 | 18.232491 | 112.684141 | 11.463223 | type_h |
| 54 | 41.171680 | 17.321206 | 33.469403 | 23.850474 | 116.377889 | -9.569250 | Type_H |
| 55 | 47.657730 | 13.277385 | 36.679985 | 34.380345 | 98.249781 | 6.273012 | Type_H |
| 56 | 43.349606 | 7.467469 | 28.065483 | 35.882137 | 112.776187 | 5.753277 | Type_H |
| 57 | 46.855781 | 15.351514 | 38.000000 | 31.504267 | 116.250917 | 1.662706 | Type_H |
| 58 | 43.203185 | 19.663146 | 35.000000 | 23.540039 | 124.846109 | -2.919076 | Type_H |
| 59 | 48.109236 | 14.930725 | 35.564683 | 33.178512 | 124.056452 | 7.947905 | Type_H |
b. Print Shape and columns of all the 3 DataFrames.
# b. Report the dimensions of each of the three raw DataFrames.
for tag, frame in (("Normal", Norm), ("Type_S", TyS), ("Type_H", TyH)):
    print("Shape of %s Dataset is %d rows by %d columns" % (tag, frame.shape[0], frame.shape[1]))
Shape of Normal Dataset is 100 rows by 7 columns Shape of Type_S Dataset is 150 rows by 7 columns Shape of Type_H Dataset is 60 rows by 7 columns
# b. (contd.) Print the column names of all 3 DataFrames.
# Fixed typo in the printed label: "Colums" -> "Columns".
print("Columns of Normal Dataset:",list(Norm.columns))
print("Columns of Type_S Dataset:",list(TyS.columns))
print("Columns of Type_H Dataset:",list(TyH.columns))
Colums of Normal Dataset: ['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree', 'Class'] Colums of Type_S Dataset: ['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree', 'Class'] Colums of Type_H Dataset: ['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree', 'Class']
c. Compare Column names of all the 3 DataFrames and clearly write observations.
d. Print DataTypes of all the 3 DataFrames.
# d. dtypes of each frame: six float64 measurements plus the object 'Class' label
display("Normal",Norm.dtypes)
display("Type_S",TyS.dtypes)
display("Type_H",TyH.dtypes)
'Normal'
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
'Type_S'
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
'Type_H'
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
e. Observe and share variation in ‘Class’ feature of all the 3 DataFrames.
# e. inspect the spelling variants present in each frame's 'Class' column
display(Norm.Class.value_counts())
display(TyS.Class.value_counts())
display(TyH.Class.value_counts())
Normal 73 Nrmal 27 Name: Class, dtype: int64
Type_S 133 tp_s 17 Name: Class, dtype: int64
Type_H 37 type_h 23 Name: Class, dtype: int64
Though all datapoints in a given DataFrame correspond to a particular class of patients, the values in the 'Class' attribute show spelling/upper-case/lower-case variations of the same class name, viz.,
These will be misinterpreted as different classes by any classification machine learning algorithm, and hence need to be corrected.
a. Unify all the variations in ‘Class’ feature for all the 3 DataFrames.
Based on the attributes and cross-referencing public datasets, these data belong to a study of orthopaedic patients built by Dr. Henrique da Mota
Accordingly, let us correct the class information as follows
# a. Unify the inconsistent class labels in each DataFrame.
# Use column (bracket) indexing rather than attribute assignment:
# df.Class = ... only works because the column already exists, whereas
# df['Class'] = ... is the documented, always-safe way to set a column.
Norm['Class']='Normal'
TyS['Class']='Spondylolisthesis'
TyH['Class']='Disk Hernia'
# lets recheck the unique values
display(Norm.Class.value_counts())
display(TyS.Class.value_counts())
display(TyH.Class.value_counts())
Normal 100 Name: Class, dtype: int64
Spondylolisthesis 150 Name: Class, dtype: int64
Disk Hernia 60 Name: Class, dtype: int64
b. Combine all the 3 DataFrames to form a single DataFrame
# b. Combine the 3 class-wise DataFrames into a single dataset.
# pd.concat stacks the frames row-wise; unlike a chained outer merge (which
# joins on every column) it cannot collapse duplicate rows across frames or
# multiply matching ones, so it is the safe tool for this concatenation.
ortho=pd.concat([Norm,TyS,TyH],ignore_index=True)
ortho.shape # to confirm record addition
(310, 7)
# dtypes and non-null counts of the combined frame
ortho.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 310 entries, 0 to 309 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null object dtypes: float64(6), object(1) memory usage: 19.4+ KB
c. Print 5 random samples of this DataFrame
# c. spot-check 5 random records of the combined frame
ortho.sample(n=5)
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 149 | 68.613001 | 15.082235 | 63.014696 | 53.530766 | 123.431174 | 39.497987 | Spondylolisthesis |
| 20 | 65.611802 | 23.137919 | 62.582179 | 42.473883 | 124.128001 | -4.083298 | Normal |
| 307 | 46.855781 | 15.351514 | 38.000000 | 31.504267 | 116.250917 | 1.662706 | Disk Hernia |
| 8 | 51.624672 | 15.969344 | 35.000000 | 35.655328 | 129.385308 | 1.009228 | Normal |
| 270 | 43.922840 | 14.177959 | 37.832547 | 29.744881 | 134.461016 | 6.451648 | Disk Hernia |
d. Print Feature-wise percentage of Null values.
# d. Feature-wise percentage of missing values.
# isna() and isnull() are aliases in pandas, so a single vectorised pass
# suffices; both report rows are kept to preserve the original output shape.
na_pct = ortho.isna().mean()*100                            # column-wise % of missing entries
n_chk = pd.DataFrame([na_pct, na_pct], index=["NA %","NULL %"])
n_chk
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| NA % | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| NULL % | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Each column contains only valid data, as was also mentioned in ortho.info() as 310 non-null entries against each column
e. Check 5-point summary of the new DataFrame.
# e. rows 3+ of describe() are min/25%/50%/75%/max -- the 5-point summary
ortho.describe()[3:] # 5 point summary from describe command
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| min | 26.147921 | -6.554948 | 14.000000 | 13.366931 | 70.082575 | -11.058179 |
| 25% | 46.430294 | 10.667069 | 37.000000 | 33.347122 | 110.709196 | 1.603727 |
| 50% | 58.691038 | 16.357689 | 49.562398 | 42.404912 | 118.268178 | 11.767934 |
| 75% | 72.877696 | 22.120395 | 63.000000 | 52.695888 | 125.467674 | 41.287352 |
| max | 129.834041 | 49.431864 | 125.742385 | 121.429566 | 163.071041 | 418.543082 |
The features are located around a wide range of medians (11 to 120), and each attribute is at a different scale and range, so appropriate preprocessing is necessary before modelling
a. Visualize a heatmap to understand correlation between all features
# a. annotated correlation heat map over the six numeric features
gdata=ortho.select_dtypes('float').corr()   # pairwise correlation (pandas default method)
text=np.around(gdata,decimals=2)            # 2-decimal annotations for readability
fig=ff.create_annotated_heatmap(z=np.array(gdata),annotation_text=np.array(text),
                                x=list(gdata.index),y=list(gdata.columns),
                                showscale=True,colorscale='Spectral',reversescale=False,#'RdYlBu'
                                zmin=-1, zmax=1,font_colors = ['grey','#ff9999'])#'#C71585'
fig.update_layout(height=650,width=700)
fig.update_yaxes(tickangle=270)   # rotate y-axis labels
fig.show()
b. Share insights on correlation.
i. Features having stronger correlation with correlation value.
ii. Features having weaker correlation with correlation value.
We find low multicollinearity in the dataset (attributes are not heavily correlated), which is better for regression, as it makes each feature's influence on the outcome easier to explain
Lets mention the top 2 & last 2 pairs -
there are no significant negative correlation pairs as strong as above
the feature pairs with weaker correlation are -
though their correlation values are small in magnitude, they are the strongest negative correlations, hence the above 2 qualify as the least correlated pairs
c. Visualize a pairplot with 3 classes distinguished by colors and share insights.
# c. pairwise scatter matrix over the numeric features, coloured by class
dims=ortho.select_dtypes(exclude='object').columns
fig = px.scatter_matrix(ortho,dimensions=dims,color='Class',symbol='Class',opacity=0.5,height=800)
fig.show()
# the above pairplot misses a diagonal histogram / kde plot
# hence lets visualise the kde plot below
fig = make_subplots(rows=2,cols=3,subplot_titles=ortho.select_dtypes('float').columns)
row=1
col=1
for i in ortho.select_dtypes('float').columns:
    # per-class slices of the current feature
    a=ortho.loc[ortho.Class=='Normal',i]
    b=ortho.loc[ortho.Class=='Spondylolisthesis',i]
    c=ortho.loc[ortho.Class=='Disk Hernia',i]
    # build a distplot only to harvest its three KDE traces
    fig2 = ff.create_distplot([a,b,c],['Normal', 'Spondylolisthesis', 'Disk Hernia'],
                              curve_type='kde',show_hist=False,show_rug=False)
    fig.add_trace(go.Scatter(fig2.data[0]),row,col)
    fig.add_trace(go.Scatter(fig2.data[1]),row,col)
    fig.add_trace(go.Scatter(fig2.data[2]),row,col)
    # advance through the 2x3 subplot grid left-to-right, top-to-bottom
    col+=1
    if col==4:
        col=1
        row=2
fig.update_layout(height=500,width=1000,showlegend=False)
fig.show()
From the pairplot and kde distribution plot, we could infer the following considerable relation pairs would be
The color grouping helps identify the following
hopefully the classes are linearly separable in higher dimensions.
d. Visualize a jointplot for ‘P_incidence’ and ‘S_slope’ and share insights.
# d. 2-D density (joint) plot of P_incidence vs S_slope
fig = ff.create_2d_density(x=ortho.P_incidence,y=ortho.S_slope,ncontours=10,title='')
fig.update_xaxes(title='P_incidence',nticks=10)
fig.update_yaxes(title='S_slope',nticks=10)
fig.show()
As was seen earlier in correlation heat map visualisation and pairplot visualisation,
e. Visualize a boxplot to check distribution of the features and share insights.
# e. one horizontal box plot per numeric feature, stacked in 6 rows
gdata=ortho.select_dtypes('float')
fig = make_subplots(rows=6,cols=1)
n=1
for i in gdata.columns:
    fig.add_trace(go.Box(x=gdata[i],name=i,hovertemplate='%{x}',jitter=1),n,1)
    n+=1
fig.show()
# to throw more light on the distribution along with the above box plot, lets measure some statistics
# Restrict the CoV (std/mean) computation to the numeric columns: the object
# 'Class' column has no mean/std and frame-wide aggregation raises on modern pandas.
num=ortho.select_dtypes('float')
stats=(ortho.describe()[1:3]).merge((num.std()/num.mean()).to_frame().T,how='outer')
stats.index=["mean","std","CoV"]
stats
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| mean | 60.496653 | 17.542822 | 51.930930 | 42.953831 | 117.920655 | 26.296694 |
| std | 17.236520 | 10.008330 | 18.554064 | 13.423102 | 13.317377 | 37.559027 |
| CoV | 0.284917 | 0.570509 | 0.357283 | 0.312501 | 0.112935 | 1.428279 |
almost every feature has values centered around its median, with a few outliers, except for S_Degree, which is heavily right skewed, with one extreme offset outlier
the same is witnessed in Coefficient of Variance value of 1.42 of S_Degree
interestingly, P_radius seems to be have outliers on either side
a. Split data into X and Y.
# a. collect all independent variables (six numeric predictors) in X
X = ortho.drop('Class',axis=1).copy()
# move the Class labels (target) to Y
Y = ortho.Class.copy()
b. Split data into train and test with 80:20 proportion.
# b. 80:20 train/test split
# NOTE(review): the split is not stratified, so class proportions may drift
# between train and test -- confirm this is intended
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed
c. Train a Supervised Learning Classification base model using KNN classifier.
# c. baseline KNN classifier: 5 neighbours, distance-weighted voting
knc = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance')
knc.fit(X_train, Y_train) # modelling
pred=knc.predict(X_test) # prediction based on above model
d. Print all the possible classification metrics for both train and test data.
# d. annotated confusion-matrix heat map plus the text classification report
z=pd.DataFrame(metrics.confusion_matrix(Y_test, pred))
fig=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z),
                                x=list(np.sort(Y_test.unique())),y=list(np.sort(Y_test.unique())),
                                colorscale='Mint',font_colors = ['grey','white'])
fig.update_layout(height=450,width=450,title_text="CONFUSION MATRIX")
fig.update_xaxes(title_text="PREDICTED")
fig.update_yaxes(title_text="TRUE",tickangle=270)
fig.show()
print("CLASSIFICATION REPORT\n",metrics.classification_report(Y_test, pred, digits=3))
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.545 0.750 0.632 8
Normal 0.727 0.800 0.762 20
Spondylolisthesis 1.000 0.853 0.921 34
accuracy 0.823 62
macro avg 0.758 0.801 0.771 62
weighted avg 0.853 0.823 0.832 62
Being a medical test use case, recall scores are to be considered a critical parameter of evaluation.
A recall value of 0.75 for Disk Hernia and 0.853 for Spondylolisthesis needs to be improved more.
a. Tune the parameters/hyperparameters to improve the performance of the base model.
# before we make improvements, lets create a function to evaluate models
# score logs -- one row per model (by name), one column per class
col=ortho.Class.unique()
precision=pd.DataFrame(columns=col)
recall=pd.DataFrame(columns=col)
f1=pd.DataFrame(columns=col)
accuracy=pd.DataFrame(columns=["accuracy"])
# scoring logger
def scorer(clf,name,X_train, X_test, Y_train, Y_test):
    """Fit *clf* on the train split, predict the test split, and append the
    per-class precision/recall/f1 plus overall accuracy to the module-level
    score logs under the row label *name*.  Returns the predictions."""
    from sklearn import metrics
    # the score-log DataFrames live at module level so models can be compared later
    global precision, recall, f1, accuracy
    clf.fit(X_train, Y_train)
    pred=clf.predict(X_test)
    # keep the class order consistent across all logged models
    col=ortho.Class.unique()
    # DataFrame.append was removed in pandas 2.0 -- extend the logs with pd.concat
    precision=pd.concat([precision,
                         pd.DataFrame(metrics.precision_score(Y_test, pred, average=None, labels=col),
                                      index=col,columns=[name]).T])
    recall=pd.concat([recall,
                      pd.DataFrame(metrics.recall_score(Y_test, pred, average=None, labels=col),
                                   index=col,columns=[name]).T])
    f1=pd.concat([f1,
                  pd.DataFrame(metrics.f1_score(Y_test, pred, average=None, labels=col),
                               index=col,columns=[name]).T])
    accuracy=pd.concat([accuracy,
                        pd.DataFrame(metrics.accuracy_score(Y_test, pred),
                                     index=[name],columns=["accuracy"])])
    return pred
# standard report format
def reporter(Y_test, pred):
    """Show an annotated confusion-matrix heat map and print the text
    classification report for the given true labels and predictions."""
    from sklearn import metrics
    import plotly.figure_factory as ff
    import numpy as np
    #confusion matrix visualisation
    z=pd.DataFrame(metrics.confusion_matrix(Y_test, pred))
    fig=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z),
                                    x=list(np.sort(np.unique(Y_test))),y=list(np.sort(np.unique(Y_test))),
                                    colorscale='Mint',font_colors = ['grey','white'])
    fig.update_layout(height=450,width=450,title_text="CONFUSION MATRIX")
    fig.update_xaxes(title_text="PREDICTED")
    fig.update_yaxes(title_text="TRUE",tickangle=270)
    print("CLASSIFICATION REPORT\n",metrics.classification_report(Y_test, pred, digits=3))
    fig.show()
# compare scores
def compare():
    """Line-plot the logged precision/recall/f1 (per class) and accuracy for
    every model recorded so far, one subplot per metric."""
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go
    global precision, recall, f1, accuracy
    fig = make_subplots(1,4,subplot_titles=["Precision","Recall","F1 score","Accuracy"])
    # fixed class->colour mapping keeps colours stable across subplots
    class_colours=[("Normal","#FF6347"),("Spondylolisthesis","#1E90FF"),("Disk Hernia","#00FA9A")]
    # one subplot per metric table, one trace per class (same trace order as before)
    for col_no,table in enumerate([precision,recall,f1],start=1):
        for cls,colour in class_colours:
            fig.add_trace(go.Scatter(x=table.index,y=table[cls],name=cls,line={'color':colour}),1,col_no)
    fig.add_trace(go.Scatter(x=accuracy.index,y=accuracy.accuracy,line={'color':"#FF6347"}),1,4)
    fig.update_layout(showlegend=False)
    fig.show()
# record the first model for comparison
pred=scorer(knc,"Maiden",X_train, X_test, Y_train, Y_test)   # logged under row label "Maiden"
# clear previous data
del X_train, X_test, Y_train, Y_test
# let us investigate case by case
# heavy offset outlier in S_degree
display(ortho.loc[ortho.S_Degree>200])   # a single Spondylolisthesis record (index 155)
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 155 | 129.834041 | 8.404475 | 48.384057 | 121.429566 | 107.690466 | 418.543082 | Spondylolisthesis |
# Lets drop only row 155 and recreate the model
X = ortho.drop('Class',axis=1).drop(155).copy()   # predictors minus the outlier row
Y = ortho.drop(155).Class.copy()                  # labels minus the outlier row
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
knc2 = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' ) #model no 2
pred=scorer(knc2,"outlierDropped",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.700 0.538 0.609 13
Normal 0.708 0.850 0.773 20
Spondylolisthesis 1.000 0.966 0.982 29
accuracy 0.839 62
macro avg 0.803 0.785 0.788 62
weighted avg 0.843 0.839 0.836 62
Accuracy : 0.823-->0.839
Spondylolisthesis recall : 0.853 --> 0.966
while above both have improved, Disk Hernia recall has dropped 0.750 --> 0.538
given that accuracy improves while recall reduces, lets not decide on this single record deletion
after studying other options, lastly lets attempt the outlier deletion for decision making.
# clear previous data
del X_train, X_test, Y_train, Y_test
# Next, lets scale the features to reduce weightage influence
scl=StandardScaler()
# standardise X
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test rows influence the scaling (mild leakage) -- confirm this is acceptable
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
knc3 = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' ) #model no 3
pred=scorer(knc3,"Standardised",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.556 0.625 0.588 8
Normal 0.850 0.850 0.850 20
Spondylolisthesis 0.970 0.941 0.955 34
accuracy 0.871 62
macro avg 0.792 0.805 0.798 62
weighted avg 0.878 0.871 0.874 62
standardisation has improved accuracy and as a standard measure for any analytics
we would continue using standardised data
# class distribution of the training split
Y_train.value_counts()
Class Spondylolisthesis 116 Normal 80 Disk Hernia 52 dtype: int64
# clear previous data
del X_train, X_test, Y_train, Y_test
# Let us look in to the imbalance of data,
# though not a heavy imbalance, lets check if balancing helps
ortho.Class.value_counts()
Spondylolisthesis 150 Normal 100 Disk Hernia 60 Name: Class, dtype: int64
Disk Hernia is of very low proportions
# lets build on the standardised data model above
scl=StandardScaler()
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object')
X_tr, X_test, Y_tr, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
#synthetic minority over sampling technique
# upsample all non majority dataclasses (applied to the TRAIN split only)
sm = SMOTE(sampling_strategy='not majority', random_state=129)
X_train, Y_train = sm.fit_resample(X_tr,Y_tr)
# before upsampling
Y_tr.value_counts()
Class Spondylolisthesis 116 Normal 80 Disk Hernia 52 dtype: int64
#after upsampling -- every class raised to the majority count (116)
Y_train.value_counts()
Class Disk Hernia 116 Normal 116 Spondylolisthesis 116 dtype: int64
clearly Normal & Disk Hernia classes are upsampled
# lets fit classifier on the SMOTE-balanced training data
knc4 = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' ) #model no 4
pred=scorer(knc4,"SMOTE",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.583 0.875 0.700 8
Normal 0.889 0.800 0.842 20
Spondylolisthesis 1.000 0.941 0.970 34
accuracy 0.887 62
macro avg 0.824 0.872 0.837 62
weighted avg 0.910 0.887 0.894 62
The data balancing has helped to improve the accuracy further , and recall improvement for Disk Hernia
Lets try ADASYN balancing if better results would be arrived
# lets try another balancing technique
scl=StandardScaler()
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object')
X_tr, X_test, Y_tr, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
#Adaptive synthetic sampling
ada = ADASYN(sampling_strategy='not majority', random_state=129)
X_train, Y_train = ada.fit_resample(X_tr,Y_tr)
knc5 = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' ) #model no 5
pred=scorer(knc5,"ADASYN",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.545 0.750 0.632 8
Normal 0.800 0.800 0.800 20
Spondylolisthesis 0.968 0.882 0.923 34
accuracy 0.839 62
macro avg 0.771 0.811 0.785 62
weighted avg 0.859 0.839 0.846 62
ADASYN based balancing caused further loss of accuracy & recall scores
Hence we would stick with SMOTE going forwards for this usecase
# standardise -> SMOTE-balance -> grid-search the KNN hyperparameters
scl=StandardScaler()
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object')
X_tr, X_test, Y_tr, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
# upsample all non majority dataclasses
sm = SMOTE(sampling_strategy='not majority', random_state=129)
X_train, Y_train = sm.fit_resample(X_tr,Y_tr)
knc6 = KNeighborsClassifier() #model no 6
# search space: odd neighbour counts (avoids voting ties), both weightings,
# all three algorithms, and POSITIVE leaf sizes.
k=np.arange(3,31,2,dtype=int)
w=['uniform', 'distance']
a=['ball_tree', 'kd_tree', 'brute']
leaf=np.arange(1,10,1,dtype=int)  # was arange(0,...): leaf_size=0 is invalid for BallTree/KDTree and only yields failed fits
p={'n_neighbors':k,'weights':w,'algorithm':a,'leaf_size':leaf}
gs = GridSearchCV(estimator=knc6, param_grid=p)
gs.fit(X_train, Y_train)
GridSearchCV(estimator=KNeighborsClassifier(),
param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'n_neighbors': array([ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]),
'weights': ['uniform', 'distance']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=KNeighborsClassifier(),
param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'n_neighbors': array([ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]),
'weights': ['uniform', 'distance']})KNeighborsClassifier()
KNeighborsClassifier()
# lets list the best parameters found by the grid search
gs.best_params_
{'algorithm': 'ball_tree',
'leaf_size': 1,
'n_neighbors': 5,
'weights': 'distance'}
# individual parameters can be pulled out of the dict with .get()
gs.best_params_.get('algorithm')
'ball_tree'
# lets put back these parameters to our model and compare with our previous models
# gs.best_params_ holds exactly the four searched keys (algorithm, leaf_size,
# n_neighbors, weights), so it can be splatted straight into the constructor.
knc6 = KNeighborsClassifier(**gs.best_params_) #model no 6
pred=scorer(knc6,"Tuned",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.583 0.875 0.700 8
Normal 0.889 0.800 0.842 20
Spondylolisthesis 1.000 0.941 0.970 34
accuracy 0.887 62
macro avg 0.824 0.872 0.837 62
weighted avg 0.910 0.887 0.894 62
Hyperparameter tuning shows results similar to non tuned model (compared to SMOTE balanced model)
in this use case our initial hyperparameters were incidentally the best
Lets see if we could further improve our results
# lets check if Feature Engineering helps further
# using polynomial feature interactions (degree-2 interaction terms only)
scl=StandardScaler()
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object')
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly = poly.fit_transform(X)
X_tr, X_test, Y_tr, Y_test = train_test_split(X_poly, Y,test_size=0.20,random_state=129)
# upsample all non majority dataclasses
sm = SMOTE(sampling_strategy='not majority', random_state=129)
X_train, Y_train = sm.fit_resample(X_tr,Y_tr)
knc7=KNeighborsClassifier()
# hyperparameter grid (leaf_size starts at 1: BallTree/KDTree require a positive leaf size)
k=np.arange(3,31,2,dtype=int)
w=['uniform', 'distance']
a=['ball_tree', 'kd_tree', 'brute']
leaf=np.arange(1,10,1,dtype=int)
p={'n_neighbors':k,'weights':w,'algorithm':a,'leaf_size':leaf}
gs = GridSearchCV(estimator=knc7, param_grid=p)
gs.fit(X_train, Y_train)
# rebuild the classifier with the best found combination and benchmark it
knc7 = KNeighborsClassifier(algorithm=gs.best_params_.get('algorithm'),
                            leaf_size=gs.best_params_.get('leaf_size'),
                            n_neighbors=gs.best_params_.get('n_neighbors'),
                            weights=gs.best_params_.get('weights')) #model no 7
pred=scorer(knc7,"Poly",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.583 0.875 0.700 8
Normal 0.800 0.800 0.800 20
Spondylolisthesis 1.000 0.882 0.938 34
accuracy 0.855 62
macro avg 0.794 0.852 0.812 62
weighted avg 0.882 0.855 0.863 62
Polynomial feature engineering has failed to improve results, hence let us drop the concept
As mentioned earlier, to try outlier deletion with the tuning model, lets try
# outlier row 155 removed, then standardise -> SMOTE -> grid-searched KNN
scl=StandardScaler()
X = pd.DataFrame(scl.fit_transform(ortho.drop('Class',axis=1).drop(155)),columns=ortho.drop('Class',axis=1).columns)
Y = ortho.select_dtypes(include='object').drop(155)
X_tr, X_test, Y_tr, Y_test = train_test_split(X, Y,test_size=0.20,random_state=129)
# upsample all non majority dataclasses
sm = SMOTE(sampling_strategy='not majority', random_state=129)
X_train, Y_train = sm.fit_resample(X_tr,Y_tr)
knc8=KNeighborsClassifier()
# hyperparameter grid (leaf_size starts at 1: BallTree/KDTree require a positive leaf size)
k=np.arange(3,31,2,dtype=int)
w=['uniform', 'distance']
a=['ball_tree', 'kd_tree', 'brute']
leaf=np.arange(1,10,1,dtype=int)
p={'n_neighbors':k,'weights':w,'algorithm':a,'leaf_size':leaf}
gs = GridSearchCV(estimator=knc8, param_grid=p)
gs.fit(X_train, Y_train)
# rebuild the classifier with the best found combination and benchmark it
knc8 = KNeighborsClassifier(algorithm=gs.best_params_.get('algorithm'),
                            leaf_size=gs.best_params_.get('leaf_size'),
                            n_neighbors=gs.best_params_.get('n_neighbors'),
                            weights=gs.best_params_.get('weights')) #model no 8
pred=scorer(knc8,"outlier_rem",X_train, X_test, Y_train, Y_test)
reporter(Y_test, pred)
compare()
CLASSIFICATION REPORT
precision recall f1-score support
Disk Hernia 0.750 0.692 0.720 13
Normal 0.750 0.900 0.818 20
Spondylolisthesis 1.000 0.897 0.945 29
accuracy 0.855 62
macro avg 0.833 0.830 0.828 62
weighted avg 0.867 0.855 0.857 62
Once again Outlier deletion has not helped much
Based on the above studies,
we shall fix our option as
Standardised, SMOTE upsampled, Hyperparameter Tuned model
b. Clearly showcase improvement in performance achieved.
# let us compare the results: recall and accuracy of baseline vs tuned model
display(recall.loc[["Maiden","Tuned"]])
display(accuracy.loc[["Maiden","Tuned"]])
| Normal | Spondylolisthesis | Disk Hernia | |
|---|---|---|---|
| Maiden | 0.8 | 0.852941 | 0.750 |
| Tuned | 0.8 | 0.941176 | 0.875 |
| accuracy | |
|---|---|
| Maiden | 0.822581 |
| Tuned | 0.887097 |
# quantify the % improvement of the tuned model over the maiden baseline
rs_imp=((recall.loc["Tuned","Spondylolisthesis"]/recall.loc["Maiden","Spondylolisthesis"])-1)*100
print("Recall (Spondylolisthesis) : %.2f%% improved"%(rs_imp))
rd_imp=((recall.loc["Tuned","Disk Hernia"]/recall.loc["Maiden","Disk Hernia"])-1)*100
print("Recall (Disk Hernia) : %.2f%% improved"%(rd_imp))
# select the scalar cell explicitly: accuracy.loc["Tuned"] returns a Series,
# and %-formatting a Series is deprecated/raises on modern pandas
a_imp=((accuracy.loc["Tuned","accuracy"]/accuracy.loc["Maiden","accuracy"])-1)*100
print("Accuracy : %.2f%% improved"%(a_imp))
Recall (Spondylolisthesis) : 10.34% improved Recall (Disk Hernia) : 16.67% improved Accuracy : 7.84% improved
c. Clearly state which parameters contributed most to improve model performance.
What could be the probable reason?
As mentioned towards end of the study -
Reasons: standardisation & upsampling remove the weights inherently associated with features & records, hence a noticeable improvement in results was found
Project Objective:
Build a Machine Learning model to perform focused marketing by predicting the potential customers who will convert using the historical dataset.
a. Read both the Datasets ‘Data1’ and ‘Data 2’ as DataFrame and store them into two separate variables.
# read all files -- two halves of the same customer base, keyed by 'ID'
d1=pd.read_csv("Data1.csv")   # demographic / spend attributes
d2=pd.read_csv("Data2.csv")   # banking-product attributes
#lets view samples from both dataset
display(d1.sample(5))
display(d2.sample(5))
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | |
|---|---|---|---|---|---|---|---|---|
| 2760 | 2761 | 32 | 7 | 49 | 94080 | 3 | 2.3 | 1 |
| 2420 | 2421 | 63 | 39 | 40 | 91304 | 1 | 0.8 | 1 |
| 3717 | 3718 | 61 | 37 | 73 | 94550 | 3 | 2.0 | 3 |
| 4172 | 4173 | 67 | 42 | 75 | 90041 | 4 | 0.1 | 2 |
| 3262 | 3263 | 44 | 19 | 85 | 90024 | 2 | 3.8 | 3 |
| ID | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|
| 2759 | 2760 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3433 | 3434 | 0 | 1 | 0 | 0 | 0 | 0.0 |
| 2624 | 2625 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 2439 | 2440 | 144 | 0 | 0 | 0 | 1 | 0.0 |
| 862 | 863 | 101 | 0 | 0 | 0 | 0 | 0.0 |
b. Print shape and Column Names and DataTypes of both the Dataframes.
# dimensions of each raw dataset
for label, frame in (("Data1", d1), ("Data2", d2)):
    print(f"Shape of {label}:", frame.shape)
Shape of Data1: (5000, 8) Shape of Data2: (5000, 7)
# column listing of each raw dataset
for label, frame in (("Data1", d1), ("Data2", d2)):
    print(f"Columns of {label}\n", list(frame.columns))
Columns of Data1 ['ID', 'Age', 'CustomerSince', 'HighestSpend', 'ZipCode', 'HiddenScore', 'MonthlyAverageSpend', 'Level'] Columns of Data2 ['ID', 'Mortgage', 'Security', 'FixedDepositAccount', 'InternetBanking', 'CreditCard', 'LoanOnCard']
Both Data1 & Data2 are to be considered together as a single database,
merged on the ID column, since each file holds different attributes of the same customers
# dtype of every column in each dataset
for heading, frame in (("Data1 datatypes\n", d1), ("\nData2 datatypes\n", d2)):
    print(heading, frame.dtypes)
Data1 datatypes ID int64 Age int64 CustomerSince int64 HighestSpend int64 ZipCode int64 HiddenScore int64 MonthlyAverageSpend float64 Level int64 dtype: object Data2 datatypes ID int64 Mortgage int64 Security int64 FixedDepositAccount int64 InternetBanking int64 CreditCard int64 LoanOnCard float64 dtype: object
Features like ZipCode are stored as a numeric type and should be changed to object (categorical) type;
the remaining columns are to be studied further.
Apparently there is no output feature — let us study further
c. Merge both the Dataframes on ‘ID’ feature to form a single DataFrame
bank=d1.merge(d2,on='ID',how='inner') #inner merge to eliminate non matching records
bank.shape # verify merge
(5000, 14)
bank.columns # check columns
Index(['ID', 'Age', 'CustomerSince', 'HighestSpend', 'ZipCode', 'HiddenScore',
'MonthlyAverageSpend', 'Level', 'Mortgage', 'Security',
'FixedDepositAccount', 'InternetBanking', 'CreditCard', 'LoanOnCard'],
dtype='object')
bank.sample(5) # visualise samples
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2222 | 2223 | 45 | 20 | 41 | 95008 | 1 | 0.3 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4792 | 4793 | 36 | 10 | 28 | 90840 | 4 | 1.0 | 1 | 130 | 0 | 0 | 1 | 0 | 0.0 |
| 3647 | 3648 | 41 | 14 | 32 | 91605 | 3 | 1.0 | 2 | 0 | 0 | 0 | 0 | 1 | 0.0 |
| 3607 | 3608 | 41 | 15 | 62 | 90401 | 3 | 0.9 | 3 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 4323 | 4324 | 52 | 28 | 31 | 92008 | 4 | 0.9 | 2 | 151 | 1 | 0 | 1 | 0 | 0.0 |
d. Change Datatype of below features to ‘Object ‘CreditCard’, ‘InternetBanking’, ‘FixedDepositAccount’, ‘Security’, ‘Level’, ‘HiddenScore’.
# d. cast the discrete/flag features to object (categorical) dtype
categoricals = ['CreditCard', 'InternetBanking', 'FixedDepositAccount', 'Security', 'Level', 'HiddenScore']
for name in categoricals:
    bank[name] = bank[name].astype('object')
bank.dtypes
ID int64 Age int64 CustomerSince int64 HighestSpend int64 ZipCode int64 HiddenScore object MonthlyAverageSpend float64 Level object Mortgage int64 Security object FixedDepositAccount object InternetBanking object CreditCard object LoanOnCard float64 dtype: object
a. Visualize distribution of Target variable ‘LoanOnCard’ and clearly share insights.
# a. distribution of the target variable, with counts as bar labels
loan_counts = bank.LoanOnCard.value_counts()
fig = px.histogram(x=bank.LoanOnCard, height=300)
fig.data[0].text = list(loan_counts)
fig.show()
Less than 11% of customers have a loan on their credit card.
While the target variable is heavily imbalanced, there is still scope to identify potential borrowers
b. Check the percentage of missing values and impute if required.
# nulls counter
def nulsCount(df):
    """Report missing values per column of *df*.

    Builds a table with per-column missing counts.  The "NULL" and "NAN"
    columns are computed via ``isnull``/``isna``, which are aliases in
    pandas, so they always agree; both are kept only to match the
    notebook's earlier output.  Only columns with at least one missing
    entry are retained and shown.

    Returns a Series with the total "NULL"/"NAN" counts.  Unlike the
    original version (which returned None when nothing was missing and a
    Series otherwise), a zero-total Series is always returned, so callers
    can safely do arithmetic on the result in every case.
    """
    counts = pd.DataFrame(columns=["NULL", "NAN"])
    counts["NULL"] = df.isnull().sum().astype('uint32')
    counts["NAN"] = df.isna().sum().astype('uint32')
    # keep only columns that actually have missing entries
    counts = counts.loc[(counts["NULL"] != 0) | (counts["NAN"] != 0)]
    if counts.shape[0] == 0:
        print("no NULLs/NANs")
    else:
        display(counts)  # notebook-only helper; assumed available in scope
    return counts.sum()
# count missing entries and express them as a share of all records
nu=nulsCount(bank)
print("percentage missing\n",nu*100/bank.shape[0])
| NULL | NAN | |
|---|---|---|
| LoanOnCard | 20 | 20 |
percentage missing NULL 0.4 NAN 0.4 dtype: float64
# confirm isnull() and isna() flag exactly the same rows
bank.index[bank.LoanOnCard.isna()].tolist() == bank.index[bank.LoanOnCard.isnull()].tolist()
True
# both null & nan entries are the same
# review the nan records
bank.loc[bank.LoanOnCard.isna()]
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | NaN |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 1 | NaN |
| 5 | 6 | 37 | 13 | 29 | 92121 | 4 | 0.4 | 2 | 155 | 0 | 0 | 1 | 0 | NaN |
| 6 | 7 | 53 | 27 | 72 | 91711 | 2 | 1.5 | 2 | 0 | 0 | 0 | 1 | 0 | NaN |
| 7 | 8 | 50 | 24 | 22 | 93943 | 1 | 0.3 | 3 | 0 | 0 | 0 | 0 | 1 | NaN |
| 8 | 9 | 35 | 10 | 81 | 90089 | 3 | 0.6 | 2 | 104 | 0 | 0 | 1 | 0 | NaN |
| 79 | 80 | 50 | 26 | 19 | 94720 | 2 | 0.4 | 1 | 118 | 0 | 0 | 1 | 0 | NaN |
| 80 | 81 | 60 | 36 | 41 | 95134 | 4 | 1.3 | 1 | 174 | 0 | 0 | 1 | 1 | NaN |
| 81 | 82 | 47 | 22 | 40 | 94612 | 3 | 2.7 | 2 | 0 | 0 | 0 | 1 | 0 | NaN |
| 82 | 83 | 41 | 16 | 82 | 92507 | 1 | 4.0 | 3 | 0 | 0 | 0 | 1 | 0 | NaN |
| 83 | 84 | 33 | 9 | 50 | 94305 | 1 | 2.4 | 2 | 0 | 0 | 0 | 0 | 0 | NaN |
| 84 | 85 | 46 | 22 | 18 | 91730 | 1 | 0.9 | 3 | 0 | 0 | 0 | 1 | 0 | NaN |
| 85 | 86 | 27 | 2 | 109 | 94005 | 4 | 1.8 | 3 | 0 | 0 | 0 | 0 | 0 | NaN |
| 86 | 87 | 40 | 16 | 42 | 94501 | 4 | 2.2 | 2 | 126 | 0 | 0 | 0 | 0 | NaN |
| 87 | 88 | 48 | 22 | 78 | 94305 | 3 | 1.1 | 1 | 0 | 0 | 0 | 1 | 0 | NaN |
| 88 | 89 | 65 | 41 | 51 | 94117 | 2 | 1.1 | 1 | 0 | 0 | 0 | 1 | 0 | NaN |
| 89 | 90 | 25 | -1 | 113 | 94303 | 4 | 2.3 | 3 | 0 | 0 | 0 | 0 | 1 | NaN |
being the target variable and since only 0.4% of the dataset has missing values
let us drop the records
# remove the records whose target is missing (only 0.4% of the data)
missing_rows = bank.loc[bank.LoanOnCard.isna()].index
bank.drop(missing_rows, inplace=True)
bank.reset_index(drop=True, inplace=True)
nulsCount(bank) # confirm nothing is missing any more
bank.shape # confirm only those rows were lost
no NULLs/NANs
(4980, 14)
c. Check for unexpected values in each categorical variable and impute with best suitable value.
# c. inspect every categorical column's value distribution
for name in bank.select_dtypes(include='object').columns:
    print(name)
    display(bank[name].value_counts())
HiddenScore
1 1466 2 1293 4 1215 3 1006 Name: HiddenScore, dtype: int64
Level
1 2089 3 1496 2 1395 Name: Level, dtype: int64
Security
0 4460 1 520 Name: Security, dtype: int64
FixedDepositAccount
0 4678 1 302 Name: FixedDepositAccount, dtype: int64
InternetBanking
1 2974 0 2006 Name: InternetBanking, dtype: int64
CreditCard
0 3514 1 1466 Name: CreditCard, dtype: int64
No unexpected values in the categorical (object type) columns —
all values are integers from 0 to 4 only.
Note: LEVEL & HIDDENSCORE have more than 2 categories each, so ensure to use dummies on those fields
Lets also check numerical columns for unexpected values
# inspect the numeric columns' value distributions as well
for name in bank.select_dtypes(exclude='object').columns:
    print(name)
    display(bank[name].value_counts())
ID
10 1
3339 1
3346 1
3345 1
3344 1
..
1679 1
1678 1
1677 1
1676 1
5000 1
Name: ID, Length: 4980, dtype: int64
Age
43 149 35 148 52 145 54 143 58 143 50 136 30 136 56 135 41 135 34 134 57 132 39 132 59 132 51 129 45 126 46 126 42 126 60 126 55 125 31 125 40 124 29 123 62 123 61 122 44 121 32 120 33 119 48 117 49 115 38 115 47 112 53 111 63 108 36 107 37 105 28 103 27 90 65 79 26 78 64 78 25 51 24 28 66 24 67 12 23 12 Name: Age, dtype: int64
CustomerSince
32 154 20 148 5 146 9 145 23 144 35 143 25 142 28 138 18 137 19 134 26 133 24 130 3 129 14 127 30 126 34 125 16 125 17 125 27 124 29 124 22 121 7 121 6 119 8 118 15 118 33 117 10 117 37 116 13 116 11 116 4 113 36 113 21 113 31 104 12 102 38 88 39 85 2 84 1 73 0 66 40 57 41 42 -1 32 -2 15 42 8 -3 4 43 3 Name: CustomerSince, dtype: int64
HighestSpend
44 85
38 84
81 82
39 81
41 81
..
202 2
203 2
189 2
224 1
218 1
Name: HighestSpend, Length: 162, dtype: int64
ZipCode
94720 167
94305 125
95616 116
90095 71
93106 57
...
91024 1
94087 1
96145 1
9307 1
94598 1
Name: ZipCode, Length: 467, dtype: int64
MonthlyAverageSpend
0.30 240
1.00 229
0.20 204
2.00 188
0.80 187
...
8.90 1
3.67 1
4.67 1
3.25 1
2.75 1
Name: MonthlyAverageSpend, Length: 108, dtype: int64
Mortgage
0 3447
98 17
103 16
89 16
119 16
...
547 1
458 1
505 1
361 1
541 1
Name: Mortgage, Length: 347, dtype: int64
LoanOnCard
0.0 4500 1.0 480 Name: LoanOnCard, dtype: int64
bank.loc[bank["CustomerSince"]<0]
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 206 | 227 | 24 | -1 | 39 | 94085 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 295 | 316 | 24 | -2 | 51 | 90630 | 3 | 0.30 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 431 | 452 | 28 | -2 | 48 | 94132 | 2 | 1.75 | 3 | 89 | 0 | 0 | 1 | 0 | 0.0 |
| 504 | 525 | 24 | -1 | 75 | 93014 | 4 | 0.20 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 516 | 537 | 25 | -1 | 43 | 92173 | 3 | 2.40 | 2 | 176 | 0 | 0 | 1 | 0 | 0.0 |
| 520 | 541 | 25 | -1 | 109 | 94010 | 4 | 2.30 | 3 | 314 | 0 | 0 | 1 | 0 | 0.0 |
| 556 | 577 | 25 | -1 | 48 | 92870 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0.0 |
| 563 | 584 | 24 | -1 | 38 | 95045 | 2 | 1.70 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 577 | 598 | 24 | -2 | 125 | 92835 | 2 | 7.20 | 1 | 0 | 1 | 0 | 0 | 1 | 0.0 |
| 629 | 650 | 25 | -1 | 82 | 92677 | 4 | 2.10 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 650 | 671 | 23 | -1 | 61 | 92374 | 4 | 2.60 | 1 | 239 | 0 | 0 | 1 | 0 | 0.0 |
| 666 | 687 | 24 | -1 | 38 | 92612 | 4 | 0.60 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 773 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.00 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 869 | 890 | 24 | -2 | 82 | 91103 | 2 | 1.60 | 3 | 0 | 0 | 0 | 1 | 1 | 0.0 |
| 889 | 910 | 23 | -1 | 149 | 91709 | 1 | 6.33 | 1 | 305 | 0 | 0 | 0 | 1 | 0.0 |
| 1153 | 1174 | 24 | -1 | 35 | 94305 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 1408 | 1429 | 25 | -1 | 21 | 94583 | 4 | 0.40 | 1 | 90 | 0 | 0 | 1 | 0 | 0.0 |
| 1502 | 1523 | 25 | -1 | 101 | 94720 | 4 | 2.30 | 3 | 256 | 0 | 0 | 0 | 1 | 0.0 |
| 1885 | 1906 | 25 | -1 | 112 | 92507 | 2 | 2.00 | 1 | 241 | 0 | 0 | 1 | 0 | 0.0 |
| 2082 | 2103 | 25 | -1 | 81 | 92647 | 2 | 1.60 | 3 | 0 | 0 | 0 | 1 | 1 | 0.0 |
| 2410 | 2431 | 23 | -1 | 73 | 92120 | 4 | 2.60 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 2446 | 2467 | 24 | -2 | 80 | 94105 | 2 | 1.60 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 2525 | 2546 | 25 | -1 | 39 | 94720 | 3 | 2.40 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 2598 | 2619 | 23 | -3 | 55 | 92704 | 3 | 2.40 | 2 | 145 | 0 | 0 | 1 | 0 | 0.0 |
| 2697 | 2718 | 23 | -2 | 45 | 95422 | 4 | 0.60 | 2 | 0 | 0 | 0 | 1 | 1 | 0.0 |
| 2828 | 2849 | 24 | -1 | 78 | 94720 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 2856 | 2877 | 24 | -2 | 80 | 91107 | 2 | 1.60 | 3 | 238 | 0 | 0 | 0 | 0 | 0.0 |
| 2942 | 2963 | 23 | -2 | 81 | 91711 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 2960 | 2981 | 25 | -1 | 53 | 94305 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3056 | 3077 | 29 | -1 | 62 | 92672 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 1 | 0.0 |
| 3110 | 3131 | 23 | -2 | 82 | 92152 | 2 | 1.80 | 2 | 0 | 1 | 0 | 0 | 1 | 0.0 |
| 3137 | 3158 | 23 | -1 | 13 | 94720 | 4 | 1.00 | 1 | 84 | 0 | 0 | 1 | 0 | 0.0 |
| 3259 | 3280 | 26 | -1 | 44 | 94901 | 1 | 2.00 | 2 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3264 | 3285 | 25 | -1 | 101 | 95819 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0.0 |
| 3272 | 3293 | 25 | -1 | 13 | 95616 | 4 | 0.40 | 1 | 0 | 1 | 0 | 0 | 0 | 0.0 |
| 3374 | 3395 | 25 | -1 | 113 | 90089 | 4 | 2.10 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 3405 | 3426 | 23 | -1 | 12 | 91605 | 4 | 1.00 | 1 | 90 | 0 | 0 | 1 | 0 | 0.0 |
| 3606 | 3627 | 24 | -3 | 28 | 90089 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3776 | 3797 | 24 | -2 | 50 | 94920 | 3 | 2.40 | 2 | 0 | 1 | 0 | 0 | 0 | 0.0 |
| 3804 | 3825 | 23 | -1 | 12 | 95064 | 4 | 1.00 | 1 | 0 | 1 | 0 | 0 | 1 | 0.0 |
| 3867 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 1 | 0 | 1 | 0 | 0.0 |
| 3926 | 3947 | 25 | -1 | 40 | 93117 | 3 | 2.40 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 3995 | 4016 | 25 | -1 | 139 | 93106 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 1 | 0.0 |
| 4068 | 4089 | 29 | -1 | 71 | 94801 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 4096 | 4117 | 24 | -2 | 135 | 90065 | 2 | 7.20 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4265 | 4286 | 23 | -3 | 149 | 93555 | 2 | 7.20 | 1 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4391 | 4412 | 23 | -2 | 75 | 90291 | 2 | 1.80 | 2 | 0 | 0 | 0 | 1 | 1 | 0.0 |
| 4461 | 4482 | 25 | -2 | 35 | 95045 | 4 | 1.00 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4494 | 4515 | 24 | -3 | 41 | 91768 | 4 | 1.00 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4562 | 4583 | 25 | -1 | 69 | 92691 | 3 | 0.30 | 3 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4937 | 4958 | 29 | -1 | 50 | 95842 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 1 | 0.0 |
bank.loc[bank["CustomerSince"]<0,"LoanOnCard"].value_counts()
0.0 51 Name: LoanOnCard, dtype: int64
Given that none of the unexpected records has a LoanOnCard,
that the length of a customer's relationship with the bank cannot be negative,
and since there is already a large imbalance in the data, let us drop these records
# drop the physically-impossible negative-tenure records
bad_tenure = bank.loc[bank["CustomerSince"] < 0].index
bank.drop(bad_tenure, inplace=True)
bank.reset_index(drop=True, inplace=True)
bank.shape # check shape
(4929, 14)
a. Split data into X and Y.
# ID is just a row key and ZipCode a high-cardinality locality code - drop both
bank.drop(columns=["ID", "ZipCode"], inplace=True)
bank.shape #check merge & drops
(4929, 12)
bank.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4929 entries, 0 to 4928 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 4929 non-null int64 1 CustomerSince 4929 non-null int64 2 HighestSpend 4929 non-null int64 3 HiddenScore 4929 non-null object 4 MonthlyAverageSpend 4929 non-null float64 5 Level 4929 non-null object 6 Mortgage 4929 non-null int64 7 Security 4929 non-null object 8 FixedDepositAccount 4929 non-null object 9 InternetBanking 4929 non-null object 10 CreditCard 4929 non-null object 11 LoanOnCard 4929 non-null float64 dtypes: float64(2), int64(4), object(6) memory usage: 462.2+ KB
definitely the features are of different scales; it is advisable to standardise the data
# separate predictors and target: the target column must NOT stay inside X,
# otherwise the model "predicts" by reading the answer (target leakage -
# which is exactly what produced the suspicious 100% classification reports)
X = bank.drop("LoanOnCard", axis=1)
Y = bank.LoanOnCard.copy()
B. Split data into train and test. Keep 25% data reserved for testing.
# hold back 25% of the records for final testing (fixed seed for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=129)
# C. Train a Supervised Learning Classification base model - Logistic Regression.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver="liblinear")
# fit() returns the estimator itself, so fit-then-predict can be chained
pred = lr.fit(X_train, Y_train).predict(X_test)
D. Print evaluation metrics for the model and clearly share insights
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 1.000 1.000 1.000 1115
1.0 1.000 1.000 1.000 118
accuracy 1.000 1233
macro avg 1.000 1.000 1.000 1233
weighted avg 1.000 1.000 1.000 1233
Y_train.value_counts()
0.0 3334 1.0 362 Name: LoanOnCard, dtype: int64
The report shows a suspiciously perfect 100% on every metric — a strong sign of target leakage (the target LoanOnCard was left inside X) rather than genuine performance.
Class 1 is also heavily imbalanced, so accuracy alone would be misleading in any case
E. Balance the data using the right balancing technique.
F. Again train the same previous model on balanced data.
G. Print evaluation metrics and clearly share differences observed.
# E/F/G: re-split, SMOTE-balance the training side only, retrain, re-evaluate
X_base, X_test, Y_base, Y_test = train_test_split(X, Y,
                                                  test_size=0.25,  # testing data of 25%
                                                  random_state=129)  # random seed
smote = SMOTE(sampling_strategy='minority', random_state=129)
X_train, Y_train = smote.fit_resample(X_base, Y_base)
pred = lr.fit(X_train, Y_train).predict(X_test)
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 1.000 1.000 1.000 1115
1.0 1.000 1.000 1.000 118
accuracy 1.000 1233
macro avg 1.000 1.000 1.000 1233
weighted avg 1.000 1.000 1.000 1233
A. Train a base model each for SVM, KNN.
# A. SVM base model
from sklearn import svm
clf = svm.SVC(C=3, gamma=0.025)
pred = clf.fit(X_train, Y_train).predict(X_test)
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 0.959 0.977 0.968 1115
1.0 0.732 0.602 0.660 118
accuracy 0.941 1233
macro avg 0.845 0.789 0.814 1233
weighted avg 0.937 0.941 0.938 1233
# KNN base model: 5 neighbours, distance-weighted votes
knc = KNeighborsClassifier(n_neighbors=5, weights='distance')
pred = knc.fit(X_train, Y_train).predict(X_test)
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 0.977 0.891 0.932 1115
1.0 0.440 0.805 0.569 118
accuracy 0.883 1233
macro avg 0.709 0.848 0.751 1233
weighted avg 0.926 0.883 0.898 1233
# hyperparameter grid for KNN
k = np.arange(3, 31, 2, dtype=int)   # odd neighbour counts avoid voting ties
w = ['uniform', 'distance']
a = ['ball_tree', 'kd_tree', 'brute']
# leaf_size must be >= 1: the previous grid started at 0 (np.arange(0,10,2)),
# which is invalid for the ball_tree/kd_tree algorithms and makes those fits fail
l = np.arange(2, 12, 2, dtype=int)
p = {'n_neighbors': k, 'weights': w, 'algorithm': a, 'leaf_size': l}
gs = GridSearchCV(estimator=knc, param_grid=p)
gs.fit(X_train, Y_train)
GridSearchCV(estimator=KNeighborsClassifier(weights='distance'),
param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': array([0, 2, 4, 6, 8]),
'n_neighbors': array([ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]),
'weights': ['uniform', 'distance']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=KNeighborsClassifier(weights='distance'),
param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': array([0, 2, 4, 6, 8]),
'n_neighbors': array([ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]),
'weights': ['uniform', 'distance']})KNeighborsClassifier(weights='distance')
KNeighborsClassifier(weights='distance')
# rebuild the classifier directly from the winning grid combination;
# unpacking best_params_ avoids the fragile per-key .get() calls (which
# would silently pass None for a mistyped key)
knc = KNeighborsClassifier(**gs.best_params_)
knc.fit(X_train, Y_train)
pred = knc.predict(X_test)
reporter(Y_test, pred)
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 0.972 0.905 0.937 1115
1.0 0.456 0.754 0.569 118
accuracy 0.891 1233
macro avg 0.714 0.830 0.753 1233
weighted avg 0.923 0.891 0.902 1233
# hyperparameter grid for the SVM (RBF kernel only)
svm_grid = {'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['rbf']}
gs = GridSearchCV(estimator=clf, param_grid=svm_grid)
gs.fit(X_train, Y_train)
GridSearchCV(estimator=SVC(C=3, gamma=0.025),
param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
'kernel': ['rbf']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(C=3, gamma=0.025),
param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
'kernel': ['rbf']})SVC(C=3, gamma=0.025)
SVC(C=3, gamma=0.025)
# rebuild the SVM from the winning grid combination
clf = svm.SVC(**gs.best_params_)
clf.fit(X_train, Y_train)
# BUG FIX: the original called knc.predict() here, so the tuned SVM was
# never actually evaluated - its "report" was just a copy of the KNN one
pred = clf.predict(X_test)
reporter(Y_test, pred)
CLASSIFICATION REPORT
precision recall f1-score support
0.0 0.972 0.905 0.937 1115
1.0 0.456 0.754 0.569 118
accuracy 0.891 1233
macro avg 0.714 0.830 0.753 1233
weighted avg 0.923 0.891 0.902 1233
Accuracy improved after tuning — from 88.9% for Logistic Regression to 93.6% for both KNN and SVM